/*
* Copyright 2015 Themistoklis Mavridis <themis.mavridis@issel.ee.auth.gr>.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.thesmartweb.swebrank;
import java.io.File;
import java.io.IOException;
import java.nio.charset.IllegalCharsetNameException;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.io.FileUtils;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
* Class related to the parsing procedures of HTML files by our mechanism
* @author Themis Mavridis
*/
public class WebParser {
/**
* The number of embeded videos
*/
public int number_embeded_videos;
/**
* The number of embeded videos that are internal to the domain links
*/
public int number_embeded_videos_internal;
/**
* The number of scripts
*/
public int scripts_number;
/**
* The number of scripts that are internal
*/
public int scripts_internal;
/**
* The number of frames
*/
public int frames_number;
/**
* The number of internal frames
*/
public int frames_internal;
/**
* The number of links
*/
public int links_number;
/**
* The number of internal links
*/
public int links_internal;
/**
* The number of schema.org usages
*/
public int nschem;
/**
* The number of hcards
*/
public int hcardsn;
/**
* hcalendars
*/
public int hcalen;
/**
* hreviews
*/
public int hrevn;
/**
* hevents
*/
public int hevenn;
/**
*hadresses
*/
public int haddrn;
/**
* hgeo
*/
public int hgeon;
/**
*rel tags
*/
public int hreln;
/**
*total microformats
*/
public int total_micron;
/**
*microformats-1
*/
public int micron1;
/**
*microformats-2
*/
public int micron2;
/**
*microdata
*/
public int microd;
/**
*number of foaf
*/
public int foaf;
/**
* Get the text content of a url cleaned from stopwords and symbols and lemmatized
* @param html_string the url to parse
* @return the content in a string
*/
public String Parse(String html_string){
String content;
content=cleanhtml(html_string);
if(content!=null){
DataManipulation txtpro = new DataManipulation();
Stopwords st = new Stopwords();
content=txtpro.removeChars(content);
content=st.stop(content);
content=txtpro.removeChars(content);
//List<String> contentList = Arrays.asList(content.split(" "));
//StemmerSnow snowballstemmer = new StemmerSnow();
//contentList=snowballstemmer.stem(contentList);
//for(String contentListItem : contentList){
// content=content+" "+contentListItem;
//}
Lemmatizer lemmatizer = new Lemmatizer();
List<String> contentList=lemmatizer.lemmatize(content);
content="";
for(String contentListItem : contentList){
content=content+" "+contentListItem;
}
}
return content;
}
/**
* Parse the url and get all the content
* @param link_html the url to parse
* @return The content parsed
*/
public String cleanhtml(String link_html)
{
try {
Document doc = Jsoup.connect(link_html).timeout(10*1000).get();
String title = doc.title();
String mainbody = doc.body().text();
Elements links = doc.select("a[href]");
Elements media = doc.select("[src]");
//fix link html to remove https:// or http:// and simple /
if(link_html.substring(link_html.length()-1,link_html.length()).equalsIgnoreCase("/")){link_html=link_html.substring(0,link_html.length()-1);}
if(link_html.substring(0,5).equalsIgnoreCase("https")){
link_html=link_html.substring(8);
}else if(link_html.substring(0,4).equalsIgnoreCase("http")){
link_html=link_html.substring(7);
}
String anchortext = "";
String alttext="";
//-----get the anchor text of internal links
for (Element link : links) {
String str_check=link.attr("abs:href").toString();
if (link.attr("abs:href").contains(link_html) && link.text().length() > 1) {
anchortext = anchortext + link.text() + " ";
}
}
//-------get alt text to internal images links
for (Element medi : media) {
if(medi.getElementsByTag("img").attr("src").toString().contains(link_html)){
alttext=alttext+" "+medi.getElementsByTag("img").attr("alt").toString();
}
if(medi.getElementsByTag("img").attr("src").toString().startsWith("/")){
alttext=alttext+" "+medi.getElementsByTag("img").attr("alt").toString();
}
}
String content = mainbody + title + anchortext+alttext;
return content;
} catch (IOException ex) {
Logger.getLogger(com.thesmartweb.swebrank.WebParser.class.getName()).log(Level.SEVERE, null, ex);
String check=null;
return check;
}
catch (NullPointerException ex) {
Logger.getLogger(com.thesmartweb.swebrank.WebParser.class.getName()).log(Level.SEVERE, null, ex);
String check=null;
return check;
}
catch (Exception ex) {
Logger.getLogger(com.thesmartweb.swebrank.WebParser.class.getName()).log(Level.SEVERE, null, ex);
String check=null;
return check;
}
}
/**
* Method to get the number of links (total, internal)
* @param link_html the url to parse
* @return the number of links
*/
public int[] getnlinks(String link_html){
int[] nlinks= new int[2];
nlinks[0]=0;//total number of links
nlinks[1]=0;//number of internal links
try {
Document doc = Jsoup.connect(link_html).timeout(10*1000).get();
Elements links = doc.select("a[href]");
nlinks[0]=links.size();
//----we check if a link is internal or not (abs is used to get the whole link (abs stands for abs)
for (Element link : links) {
if (link.attr("abs:href").contains(link_html)) {nlinks[1]++;}
}
return nlinks;
} catch (Exception ex) {
Logger.getLogger(com.thesmartweb.swebrank.WebParser.class.getName()).log(Level.SEVERE, null, ex);
return nlinks;
}
}
/**
* Method to get the various html stats
* @param link_html the url to analyze
* @return flag if we got all the stats
*/
public boolean gethtmlstats(String link_html){
try {
Document doc = Jsoup.connect(link_html).timeout(10*1000).get();
Elements schemas=doc.getElementsByAttributeValueContaining("itemtype", "schema.org");
Elements microdata=doc.getElementsByAttribute("itemtype");
Elements microformats_vcard=doc.getElementsByAttributeValueContaining("class", "vcard");
Elements microformats_hreview=doc.getElementsByAttributeValueContaining("class", "hreview");
Elements microformats_vevent=doc.getElementsByAttributeValueContaining("class", "vevent");
Elements microformats_vcalendar=doc.getElementsByAttributeValueContaining("class", "vcalendar");
Elements microformats_vgeo=doc.getElementsByAttributeValueContaining("class", "geo");
Elements microformats_vadrn=doc.getElementsByAttributeValueContaining("class", "ardn");
Elements microformats_acquaintance=doc.getElementsByAttributeValueContaining("rel", "link_html");
Elements microformats_alternate=doc.getElementsByAttributeValueContaining("rel", "alternate");
Elements microformats_appendix=doc.getElementsByAttributeValueContaining("rel", "appendix");
Elements microformats_bookmark=doc.getElementsByAttributeValueContaining("rel", "bookmark");
Elements microformats_chapter=doc.getElementsByAttributeValueContaining("rel", "chapter");
Elements microformats_child=doc.getElementsByAttributeValueContaining("rel", "child");
Elements microformats_coll=doc.getElementsByAttributeValueContaining("rel", "colleague");
Elements microformats_contact=doc.getElementsByAttributeValueContaining("rel", "contact");
Elements microformats_contents=doc.getElementsByAttributeValueContaining("rel", "contents");
Elements microformats_copyright=doc.getElementsByAttributeValueContaining("rel", "copyright");
Elements microformats_coresident=doc.getElementsByAttributeValueContaining("rel", "co-resident");
Elements microformats_coworker=doc.getElementsByAttributeValueContaining("rel", "co-worker");
Elements microformats_crush=doc.getElementsByAttributeValueContaining("rel", "crush");
Elements microformats_date=doc.getElementsByAttributeValueContaining("rel", "date");
Elements microformats_friend=doc.getElementsByAttributeValueContaining("rel", "friend");
Elements microformats_glossary=doc.getElementsByAttributeValueContaining("rel", "glossary");
Elements microformats_help=doc.getElementsByAttributeValueContaining("rel", "help");
Elements microformats_itsrules=doc.getElementsByAttributeValueContaining("rel", "its-rules");
Elements microformats_kin=doc.getElementsByAttributeValueContaining("rel", "kin");
Elements microformats_license=doc.getElementsByAttributeValueContaining("rel", "license");
Elements microformats_me=doc.getElementsByAttributeValueContaining("rel", "me");
Elements microformats_met=doc.getElementsByAttributeValueContaining("rel", "met");
Elements microformats_muse=doc.getElementsByAttributeValueContaining("rel", "muse");
Elements microformats_neighbor=doc.getElementsByAttributeValueContaining("rel", "neighbor");
Elements microformats_next=doc.getElementsByAttributeValueContaining("rel", "next");
Elements microformats_nofollow=doc.getElementsByAttributeValueContaining("rel", "nofollow");
Elements microformats_parent=doc.getElementsByAttributeValueContaining("rel", "parent");
Elements microformats_prev=doc.getElementsByAttributeValueContaining("rel", "prev");
Elements microformats_previous=doc.getElementsByAttributeValueContaining("rel", "previous");
Elements microformats_section=doc.getElementsByAttributeValueContaining("rel", "section");
Elements microformats_sibling=doc.getElementsByAttributeValueContaining("rel", "sibling");
Elements microformats_spouse=doc.getElementsByAttributeValueContaining("rel", "spouse");
Elements microformats_start=doc.getElementsByAttributeValueContaining("rel", "start");
Elements microformats_stylesheet=doc.getElementsByAttributeValueContaining("rel", "stylesheet");
Elements microformats_subsection=doc.getElementsByAttributeValueContaining("rel", "subsection");
Elements microformats_sweetheart=doc.getElementsByAttributeValueContaining("rel", "sweetheart");
Elements microformats_tag=doc.getElementsByAttributeValueContaining("rel", "tag");
Elements microformats_toc=doc.getElementsByAttributeValueContaining("rel", "toc");
Elements microformats_transformation=doc.getElementsByAttributeValueContaining("rel", "transformation");
Elements microformats_appleti=doc.getElementsByAttributeValueContaining("rel", "apple-touch-icon");
Elements microformats_appletip=doc.getElementsByAttributeValueContaining("rel", "apple-touch-icon-precomposed");
Elements microformats_appletsi=doc.getElementsByAttributeValueContaining("rel", "apple-touch-startup-image");
Elements microformats_attachment=doc.getElementsByAttributeValueContaining("rel", "attachment");
Elements microformats_can=doc.getElementsByAttributeValueContaining("rel", "canonical");
Elements microformats_categ=doc.getElementsByAttributeValueContaining("rel", "category");
Elements microformats_compon=doc.getElementsByAttributeValueContaining("rel", "component");
Elements microformats_chromewebi=doc.getElementsByAttributeValueContaining("rel", "chrome-webstore-item");
Elements microformats_disclosure=doc.getElementsByAttributeValueContaining("rel", "disclosure");
Elements microformats_discussion=doc.getElementsByAttributeValueContaining("rel", "discussion");
Elements microformats_dns=doc.getElementsByAttributeValueContaining("rel", "dns-prefetch");
Elements microformats_edit=doc.getElementsByAttributeValueContaining("rel", "edit");
Elements microformats_edituri=doc.getElementsByAttributeValueContaining("rel", "EditURI");
Elements microformats_entrycon=doc.getElementsByAttributeValueContaining("rel", "entry-content");
Elements microformats_external=doc.getElementsByAttributeValueContaining("rel", "external");
Elements microformats_home=doc.getElementsByAttributeValueContaining("rel", "home");
Elements microformats_hub=doc.getElementsByAttributeValueContaining("rel", "hub");
Elements microformats_inreplyto=doc.getElementsByAttributeValueContaining("rel", "in-reply-to");
Elements microformats_index=doc.getElementsByAttributeValueContaining("rel", "index");
Elements microformats_indieauth=doc.getElementsByAttributeValueContaining("rel", "indieauth");
Elements microformats_issues=doc.getElementsByAttributeValueContaining("rel", "issues");
Elements microformats_lightbox=doc.getElementsByAttributeValueContaining("rel", "lightbox");
Elements microformats_meta=doc.getElementsByAttributeValueContaining("rel", "meta");
Elements microformats_openid=doc.getElementsByAttributeValueContaining("rel", "opendid");
Elements microformats_p3pv1=doc.getElementsByAttributeValueContaining("rel", "p3pv1");
Elements microformats_pgpkey=doc.getElementsByAttributeValueContaining("rel", "pgpkey");
Elements microformats_pingback=doc.getElementsByAttributeValueContaining("rel", "pingback");
Elements microformats_prerender=doc.getElementsByAttributeValueContaining("rel", "prerender");
Elements microformats_profile=doc.getElementsByAttributeValueContaining("rel", "profile");
Elements microformats_rendition=doc.getElementsByAttributeValueContaining("rel", "rendition");
Elements microformats_service=doc.getElementsByAttributeValueContaining("rel", "service");
Elements microformats_shortlink=doc.getElementsByAttributeValueContaining("rel", "shortlink");
Elements microformats_sidebar=doc.getElementsByAttributeValueContaining("rel", "sidebar");
Elements microformats_sitemap=doc.getElementsByAttributeValueContaining("rel", "sitemap");
Elements microformats_subresource=doc.getElementsByAttributeValueContaining("rel", "subresource");
Elements microformats_syndication=doc.getElementsByAttributeValueContaining("rel", "syndication");
Elements microformats_timesheet=doc.getElementsByAttributeValueContaining("rel", "timesheet");
Elements microformats_webmention=doc.getElementsByAttributeValueContaining("rel", "webmention");
Elements microformats_widget=doc.getElementsByAttributeValueContaining("rel", "widget");
Elements microformats_wlwmanifest=doc.getElementsByAttributeValueContaining("rel", "wlwmanifest");
Elements microformats_imgsrc=doc.getElementsByAttributeValueContaining("rel", "image_src");
Elements microformats_cmisacl=doc.getElementsByAttributeValueContaining("rel", "http://docs.oasis-open.org/ns/cmis/link/200908/acl");
Elements microformats_stylesheetless=doc.getElementsByAttributeValueContaining("rel", "stylesheet/less");
Elements microformats_accessibility=doc.getElementsByAttributeValueContaining("rel", "accessibility");
Elements microformats_biblio=doc.getElementsByAttributeValueContaining("rel", "bibliography");
Elements microformats_cite=doc.getElementsByAttributeValueContaining("rel", "cite");
Elements microformats_group=doc.getElementsByAttributeValueContaining("rel", "group");
Elements microformats_jslicence=doc.getElementsByAttributeValueContaining("rel", "jslicense");
Elements microformats_longdesc=doc.getElementsByAttributeValueContaining("rel", "longdesc");
Elements microformats_map=doc.getElementsByAttributeValueContaining("rel", "map");
Elements microformats_member=doc.getElementsByAttributeValueContaining("rel", "member");
Elements microformats_source=doc.getElementsByAttributeValueContaining("rel", "source");
Elements microformats_status=doc.getElementsByAttributeValueContaining("rel", "status");
Elements microformats_archive=doc.getElementsByAttributeValueContaining("rel", "archive");
Elements microformats_archives=doc.getElementsByAttributeValueContaining("rel", "archives");
Elements microformats_comment=doc.getElementsByAttributeValueContaining("rel", "comment");
Elements microformats_contribution=doc.getElementsByAttributeValueContaining("rel", "contribution");
Elements microformats_endorsed=doc.getElementsByAttributeValueContaining("rel", "endorsed");
Elements microformats_fan=doc.getElementsByAttributeValueContaining("rel", "fan");
Elements microformats_feed=doc.getElementsByAttributeValueContaining("rel", "feed");
Elements microformats_footnote=doc.getElementsByAttributeValueContaining("rel", "footnote");
Elements microformats_icon=doc.getElementsByAttributeValueContaining("rel", "icon");
Elements microformats_kinstyle=doc.getElementsByAttributeValueContaining("rel", "kinetic-stylesheet");
Elements microformats_prettyphoto=doc.getElementsByAttributeValueContaining("rel", "prettyPhoto");
Elements microformats_clearbox=doc.getElementsByAttributeValueContaining("rel", "clearbox");
Elements microformats_made=doc.getElementsByAttributeValueContaining("rel", "made");
Elements microformats_microsummary=doc.getElementsByAttributeValueContaining("rel", "microsummary");
Elements microformats_noreferrer=doc.getElementsByAttributeValueContaining("rel", "noreferrer");
Elements microformats_permalink=doc.getElementsByAttributeValueContaining("rel", "permalink");
Elements microformats_popover=doc.getElementsByAttributeValueContaining("rel", "popover");
Elements microformats_prefetch=doc.getElementsByAttributeValueContaining("rel", "prefetch");
Elements microformats_publickey=doc.getElementsByAttributeValueContaining("rel", "publickey");
Elements microformats_publisher=doc.getElementsByAttributeValueContaining("rel", "publisher");
Elements microformats_referral=doc.getElementsByAttributeValueContaining("rel", "referral");
Elements microformats_related=doc.getElementsByAttributeValueContaining("rel", "related");
Elements microformats_replies=doc.getElementsByAttributeValueContaining("rel", "replies");
Elements microformats_resource=doc.getElementsByAttributeValueContaining("rel", "resource");
Elements microformats_search=doc.getElementsByAttributeValueContaining("rel", "search");
Elements microformats_sponsor=doc.getElementsByAttributeValueContaining("rel", "sponsor");
Elements microformats_tooltip=doc.getElementsByAttributeValueContaining("rel", "tooltip");
Elements microformats_trackback=doc.getElementsByAttributeValueContaining("rel", "trackback");
Elements microformats_unendorsed=doc.getElementsByAttributeValueContaining("rel", "unendorsed");
Elements microformats_user=doc.getElementsByAttributeValueContaining("rel", "user");
Elements microformats_wlw=doc.getElementsByAttributeValueContaining("rel", "wlwmanifest");
//-----microformats2
Elements microformats2_hadr=doc.getElementsByAttributeValueContaining("class", "h-adr");
Elements microformats2_hcard=doc.getElementsByAttributeValueContaining("class", "h-card");
Elements microformats2_hentry=doc.getElementsByAttributeValueContaining("class", "h-entry");
Elements microformats2_hevent=doc.getElementsByAttributeValueContaining("class", "h-event");
Elements microformats2_hgeo=doc.getElementsByAttributeValueContaining("class", "h-geo");
Elements microformats2_hitem=doc.getElementsByAttributeValueContaining("class", "h-item");
Elements microformats2_hproduct=doc.getElementsByAttributeValueContaining("class", "h-product");
Elements microformats2_hrecipe=doc.getElementsByAttributeValueContaining("class", "h-recipe");
Elements microformats2_hresume=doc.getElementsByAttributeValueContaining("class", "h-resume");
Elements microformats2_hreview=doc.getElementsByAttributeValueContaining("class", "h-review");
Elements microformats2_hreviewagg=doc.getElementsByAttributeValueContaining("class", "h-review-aggregate");
Elements foaf_autodiscoveries=doc.getElementsByAttributeValueContaining("href", "foaf");
Elements foaf_types=doc.getElementsByAttributeValueContaining("type", "foaf");
Elements media = doc.select("embed");
Elements iframes = doc.select("iframe");
Elements script_el=doc.select("script");
Elements reltags=doc.select("link[rel]");
Elements reltags_a=doc.select("a[rel]");
number_embeded_videos=media.size();
scripts_number=script_el.size();
frames_number=iframes.size();
nschem=schemas.size();
hreln=reltags.size()+reltags_a.size();
foaf=foaf_autodiscoveries.size()+foaf_types.size();
micron1=microformats_cmisacl.size()+microformats_vcard.size()+microformats_vevent.size()+microformats_hreview.size()+microformats_vgeo.size()+microformats_vcalendar.size()+microformats_vadrn.size()+microformats_acquaintance.size()+microformats_alternate.size()+microformats_appendix.size()+ microformats_biblio.size()+microformats_bookmark.size()+microformats_chapter.size()+ microformats_child.size()+microformats_coll.size()+microformats_contact.size()+microformats_contents.size()+microformats_copyright.size()+microformats_coresident.size()+microformats_coworker.size()+microformats_crush.size()+microformats_date.size()+microformats_friend.size()+microformats_glossary.size()+microformats_help.size()+microformats_itsrules.size()+microformats_kin.size()+microformats_license.size()+microformats_me.size()+microformats_met.size()+microformats_muse.size()+microformats_neighbor.size()+microformats_next.size()+microformats_nofollow.size()+microformats_parent.size()+microformats_prev.size()+microformats_previous.size()+microformats_section.size()+microformats_sibling.size()+microformats_spouse.size()+microformats_start.size()+microformats_stylesheet.size()+microformats_subsection.size()+microformats_sweetheart.size()+microformats_tag.size()+microformats_toc.size()+microformats_transformation.size()+microformats_appleti.size()+microformats_appletip.size()+microformats_appletsi.size()+microformats_attachment.size()+microformats_can.size()+microformats_categ.size()+microformats_compon.size()+microformats_chromewebi.size()+microformats_disclosure.size()+microformats_discussion.size()+microformats_dns.size()+microformats_edit.size()+microformats_edituri.size()+microformats_entrycon.size()+microformats_external.size()+microformats_home.size()+microformats_hub.size()+microformats_inreplyto.size()+microformats_index.size()+microformats_indieauth.size()+microformats_issues.size()+microformats_lightbox.size()+microformats_meta.size()+microformats_openid.size()+microformats_p3pv1.size()+microformats_pgpkey.size()+microformats_pingback.size()+microformats_prerender.size()+microformats_profile.size()+microformats_rendition.size()+microformats_service.size()+microformats_shortlink.size()+microformats_sidebar.size()+microformats_sitemap.size()+microformats_subresource.size()+microformats_syndication.size()+microformats_timesheet.size()+ microformats_webmention.size()+microformats_widget.size()+microformats_wlwmanifest.size()+microformats_imgsrc.size()+microformats_imgsrc.size()+microformats_stylesheetless.size()+microformats_accessibility.size()+microformats_accessibility.size()+microformats_cite.size()+microformats_group.size()+ microformats_jslicence.size()+microformats_longdesc.size()+microformats_map.size()+microformats_member.size()+microformats_source.size()+ microformats_status.size()+microformats_archive.size()+microformats_archives.size()+microformats_comment.size()+microformats_contribution.size()+microformats_endorsed.size()+microformats_fan.size()+microformats_feed.size()+microformats_footnote.size()+microformats_icon.size()+microformats_kinstyle.size()+microformats_prettyphoto.size()+microformats_clearbox.size()+microformats_made.size()+microformats_microsummary.size()+microformats_noreferrer.size()+microformats_permalink.size()+microformats_popover.size()+microformats_prefetch.size()+microformats_publickey.size()+microformats_publisher.size()+microformats_referral.size()+microformats_related.size()+microformats_replies.size()+microformats_resource.size()+microformats_search.size()+microformats_sponsor.size()+microformats_tooltip.size()+microformats_trackback.size()+microformats_unendorsed.size()+microformats_user.size()+microformats_wlw.size()+foaf;
micron2=microformats2_hadr.size()+microformats2_hcard.size()+microformats2_hentry.size()+microformats2_hevent.size()+microformats2_hgeo.size()+microformats2_hitem.size()+microformats2_hproduct.size()+microformats2_hrecipe.size()+microformats2_hresume.size()+microformats2_hreview.size()+microformats2_hreviewagg.size();
total_micron=micron1+micron2;
microd=microdata.size();
return true;
} catch (IOException | IllegalCharsetNameException ex) {
Logger.getLogger(com.thesmartweb.swebrank.WebParser.class.getName()).log(Level.SEVERE, null, ex);
return false;
}
}
/**
* Method to get all the elements with a specific html feature (not used in SWebRank's current version)
* @param link_html the url to check
* @param dir the directory to save the file
* @return a list with the text of all the elements
*/
public List<String> getbold(String link_html,String dir)
{ List<String> SEwords=new ArrayList<String>();
try {
//link_html="http://www.themismavridis.com/";
Document doc = Jsoup.connect(link_html).get();
//---------to select the rest of the terms
Elements bold= doc.select("em");
//bold=bold.select("b");
for (Element btext : bold) {
String stringtosplit = btext.text().toString().toString();
if(!(stringtosplit==null)&&(!(stringtosplit.equalsIgnoreCase("")))){
stringtosplit=stringtosplit.replaceAll("[\\W&&[^\\s]]", "");
if(!(stringtosplit==null)&&(!(stringtosplit.equalsIgnoreCase("")))){
String[] tokenizedTerms=stringtosplit.split("\\W+");
for(int j=0;j<tokenizedTerms.length;j++){
if(!(tokenizedTerms[j]==null)&&(!(tokenizedTerms[j].equalsIgnoreCase("")))){
SEwords.add(tokenizedTerms[j]);
}
}
}
}
}
File file_thelist = new File(dir+"Javawords.txt");
FileUtils.writeLines(file_thelist, SEwords);
return SEwords;
} catch (IOException ex) {
Logger.getLogger(WebParser.class.getName()).log(Level.SEVERE, null, ex);
System.out.print("can not create the content file for SEwords");
return SEwords;
}
}
/**
* Method to check if we can connect with JSOUP to a specific url
* @param link_html the url to connect
* @return true/false
*/
public boolean checkconn(String link_html){
try {
Connection.Response response = Jsoup.connect(link_html).timeout(10*1000).execute();
return response.statusCode() == 200;
} catch (Exception ex) {
Logger.getLogger(WebParser.class.getName()).log(Level.SEVERE, null, ex);
System.out.print("can not connect to:"+link_html);
return false;
}
}
}